In this project we are going to clean, investigate, explore and analyse IMDB Movies Dataset for correlations as well as extract some insights and trends from the valid data. The main objective of this project is to answer 5 questions related to this dataset.
# Importing libraries in use
import pandas as pd
import numpy as np
import datetime as dt
import sklearn.preprocessing
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Defining ecdf Function for Cumilative Distribution Function
def ecdf(data):
#credits DataCamp Justin Bois
"""Compute ECDF for a one-dimensional array of measurements."""
# Number of data points: n
n = len(data)
# x-data for the ECDF: x
x = np.sort(data)
# y-data for the ECDF: y
y = np.arange(1, n+1) / n
return x, y
# Importing the dataset using pandas
df = pd.read_csv('https://d17h27t6h515a5.cloudfront.net/topher/2017/October/59dd1c4c_tmdb-movies/tmdb-movies.csv')
display(df.head())
display(df.info())
display(df.describe())
For targeted analysis in question, certain columns are irrelevent and pose problems with missing data, thus dropping said columns is preferable for the analysis process
# Dropping columns that pose problems and are irrelavent
columns_todrop = ['id','imdb_id','homepage','tagline','keywords','production_companies','budget_adj','revenue_adj']
df.drop(columns_todrop,axis=1,inplace=True)
# Dropping missing data to further clean the set
df.dropna(inplace=True)
Reformatting cast, genere and directors columns as they contain a concatenated list of elements that needs to be split into a list
# splitting strings and converting them to strings
for i in df.index:
df.at[i,'cast'] = df.cast[i].split(sep='|')
df.at[i,'genres'] = df.genres[i].split(sep='|')
df.at[i,'director'] = df.director[i].split(sep='|')
# converting date to datetime format
df.release_date = pd.to_datetime(df.release_date, format='%m/%d/%y')
# fixing pandas datetime issue with dates formated as %YY befor 1969
df['release_date'] = df['release_date'].mask(df['release_date'].dt.year > 2015,
df['release_date'] - pd.Timedelta(36500, unit = 'D'))
# resetting index
df.reset_index(inplace=True,drop=True)
display(df.info())
display(df.head())
df.corr().style.background_gradient(cmap='Blues')
# using plot.ly dynamic library to plot the relationship between revenue and budget with popularity as the third dimension (color)
# subsetting only successful movies where revenue > budget
df_rev = df[df.revenue > df.budget]
fig = px.scatter(data_frame=df_rev,x='revenue',y='budget',color='popularity',hover_data=['original_title'],title = 'Relationship between Revenue & Budget')
fig.show()
#Using ecdf to compute the CDF
x1,y1 = list(ecdf(df_rev.budget))
x,y = list(ecdf(df_rev.revenue))
#Create a subplot to fit two axis
fig = make_subplots(rows=1, cols=2,subplot_titles=(f'Cumulative distribution function of Budget', f'Cumulative distribution function of revenue'))
#add first plot of Budget
fig.add_trace(
go.Scatter(x= x1,y = y1,name = f'CDF of Budget'),
row=1, col=1
)
#add second plot of Revenue
fig.add_trace(
go.Scatter(x = x,y = y,name = f'CDF of Revenue'),
row=1, col=2
)
#control title and figure dimentions
fig.update_layout(height=500, width=1000, title_text="Cumulative distribution functions")
fig.show()
# Subsetting top 10 most popular movies
df_temp_gen = df.nlargest(10,'popularity').explode('genres')
# Counting genres within top 10 popular movies
df_temp_gen['count'] = df_temp_gen.groupby('genres')['genres'].transform('count')
# Sorting and removing duplicates
top_genres = df_temp_gen[['genres','count']].drop_duplicates().sort_values('count',ascending=False).nlargest(10,'count').reset_index()
# Displaying results
display(top_genres[['genres','count']])
# Grouping df by genres by each year in TEMP df
df_temp = df.explode('genres')
df_temp['count'] = df_temp.groupby('genres')['genres'].transform('count')
# Listing top 10 most common genres of all time
top_gen = df_temp[['genres','count']].drop_duplicates().sort_values('count',ascending=False).nlargest(10,'count').reset_index()
# Subsetting dataframe to contain only used columns
df_temp = df_temp[['release_year','genres','count']].groupby(['release_year','genres']).count().reset_index()
# Subsetting dataframe to contain only most common genres of all time
df_temp = df_temp[df_temp['genres'].isin(top_gen['genres'])]
# Plotting relationship
fig = px.line(df_temp,x = 'release_year', y = 'count',color = 'genres',title=f"Change in top 10 genres' popularity over time")
fig.show()
# Subsetting successful movies
df_rev = df[df.revenue > df.budget]
# Plotting the relationship between revenue and popularity with budget as the third dimension (color)
fig = px.scatter(data_frame=df_rev,x='revenue',y='popularity',color='budget',hover_data=['original_title'],title = 'Relationship between Revenue & Popularity')
fig.show()
#Using ecdf to compute the CDF
x1,y1 = list(ecdf(df_rev.popularity))
x,y = list(ecdf(df_rev.revenue))
#Create a subplot to fit two axis
fig = make_subplots(rows=1, cols=2,subplot_titles=(f'Cumulative distribution function of Popularity', f'Cumulative distribution function of revenue'))
#add first plot of Popularity
fig.add_trace(
go.Scatter(x= x1,y = y1,name = f'CDF of Popularity'),
row=1, col=1
)
#add second plot of Revenue
fig.add_trace(
go.Scatter(x = x,y = y,name = f'CDF of Revenue'),
row=1, col=2
)
#control title and figure dimentions
fig.update_layout(height=500, width=1000, title_text="Cumulative distribution functions")
fig.show()
# Subsetting and exploding 'cast' lists
df_temp2 = df.explode('cast')
# Grouping and counting each actor occurence in a movie
df_temp2['count'] = df_temp2.groupby('cast')['cast'].transform('count')
# Sorting and removing duplicates
df_temp2 = df_temp2[['cast','count']].drop_duplicates().sort_values('count',ascending=False).reset_index().nlargest(10,'count')
# Displaying results
display(df_temp2[['cast','count']])
# Subsetting and exploding 'cast' lists
df_temp3 = df.explode('director')
# Grouping and counting each director occurence in a movie
df_temp3['count'] = df_temp3.groupby('director')['director'].transform('count')
# Sorting and removing duplicates
df_temp3 = df_temp3[['director','count']].drop_duplicates().sort_values('count',ascending=False).reset_index().nlargest(10,'count')
# Displaying results
display(df_temp3[['director','count']])